Introduction

Abalones are one type of reef-dwelling marine snails. It is difficult to tell the ages of abalones because their shellsizes not only depend on how old they are, but also depend on the availability of food. The study of age is usually by obtaining a stained sample of the shell and looking at the number of rings through a microscope. We are interested in using some of abalones physical measurements, especially the height measurement to predict their ages. Biologists believe that a simple linear regression model with normal error assumption is appropriate to describe the relationship between the height of abalones and their ages. In particular, that a larger height is associated with an older age.

The dataset and its description are available at https://archive.ics.uci.edu/ml/datasets/Abalone.

Global libraries and parameters

Firstly, we import the necessary libraries.

library(readr)
library(carData)
library(car)
library(knitr)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggfortify)
library(MASS)
library(rpart)
library(caret)
## Le chargement a nécessité le package : lattice
library(lattice)
library(foreach)
library(doParallel)
## Le chargement a nécessité le package : iterators
## Le chargement a nécessité le package : parallel
library(iterators)
library(parallel)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attachement du package : 'randomForest'
## L'objet suivant est masqué depuis 'package:ggplot2':
## 
##     margin
library(adabag)
library(dplyr)
## 
## Attachement du package : 'dplyr'
## L'objet suivant est masqué depuis 'package:randomForest':
## 
##     combine
## L'objet suivant est masqué depuis 'package:MASS':
## 
##     select
## L'objet suivant est masqué depuis 'package:car':
## 
##     recode
## Les objets suivants sont masqués depuis 'package:stats':
## 
##     filter, lag
## Les objets suivants sont masqués depuis 'package:base':
## 
##     intersect, setdiff, setequal, union
library(xgboost)
## 
## Attachement du package : 'xgboost'
## L'objet suivant est masqué depuis 'package:dplyr':
## 
##     slice
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.6     ✔ stringr 1.4.0
## ✔ tidyr   1.1.4     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate()    masks foreach::accumulate()
## ✖ dplyr::combine()       masks randomForest::combine()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ purrr::lift()          masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ dplyr::recode()        masks car::recode()
## ✖ dplyr::select()        masks MASS::select()
## ✖ xgboost::slice()       masks dplyr::slice()
## ✖ purrr::some()          masks car::some()
## ✖ purrr::when()          masks foreach::when()
library(QBAsyDist)

Then, we set up global parameters.

set.seed(42)

Importing the data

# Read the csv file
abalone <- read.csv2("abalone_data.csv", header = T, sep = ",")

# Lowercase column names
names(abalone) <- tolower(names(abalone))
init_feat <- names(abalone)

# Data types
abalone$sex <- as.factor(abalone$sex)
abalone$whole_weight <- as.double(abalone$whole_weight)
abalone$shucked_weight <- as.double(abalone$shucked_weight)
abalone$viscera_weight <- as.double(abalone$viscera_weight)
abalone$shell_weight <- as.double(abalone$shell_weight)

# Feature engineering
abalone <- abalone[is.finite(log(abalone$height)), ]

abalone$log_length <- log(abalone$length)
abalone$log_diameter <- log(abalone$diameter)
abalone$height2 <- abalone$height^2
abalone$log_height <- log(abalone$height)
abalone$log_shucked_weight <- log(abalone$shucked_weight)
abalone$log_viscera_weight <- log(abalone$viscera_weight)
abalone$log_shell_weight <- log(abalone$shell_weight)
abalone$log_whole_weight <- log(abalone$whole_weight)
abalone$log_rings <- log(abalone$rings)
abalone$sqrt_log_rings <- sqrt(log(abalone$rings))
abalone$adulthood <- 1
abalone$adulthood[abalone$sex == "I"] <- 0

# Splitting dataset in train and test using 70/30 method
indices <- sample(seq_len(nrow(abalone)), size = 0.3 * nrow(abalone))
abalone_train <- abalone[-indices, ]
abalone_test <- abalone[indices, ]

Utils

# RSS and related values
residual_mean_of_squares <- function(y, y_pred) {
    return(mean((y - y_pred)^2))
}

residual_sum_of_squares <- function(y, y_pred) {
    return(sum((y - y_pred)^2))
}

mean_absolute_error <- function(y, y_pred) {
    return(mean(abs(y - y_pred)))
}

# Utilitary function
round_to_integer <- function(number) {
    if (number - floor(number) > 0.5) {
        return(ceiling(number))
    }
    return(floor(number))
}

# Convert predictions of models
to_rings <- function(vector) {
  as.double(lapply(vector, round_to_integer))
}

predict_rings <- function(lm, x_test) {
    return(to_rings(predict(lm, x_test)))
}

predict_error <- function(lm, x_test) {
    return(
        residual_mean_of_squares(x_test$rings, predict_rings(lm, x_test))
    )
}

log_to_rings <- function(vector) {
    vector_inversed <- exp(vector)
    as.double(lapply(vector_inversed, round_to_integer))
}

predict_log_to_rings <- function(lm, x_test, ancova = FALSE) {
    return(log_to_rings(predict(lm, x_test, ancova = ancova)))
}

predict_log_error <- function(lm, x_test, ancova = FALSE) {
    return(
      residual_mean_of_squares(x_test$rings, predict_log_to_rings(lm, x_test, ancova = ancova))
    )
}

sqrt_log_to_rings <- function(vector) {
  vector_inversed <- exp(vector^2)
  as.double(lapply(vector_inversed, round_to_integer))
}

predict_sqrt_log_to_rings <- function(lm, x_test, ancova = FALSE) {
    return(sqrt_log_to_rings(predict(lm, x_test, ancova = ancova)))
}

predict_sqrt_log_error <- function(lm, x_test, ancova = FALSE) {
    return(
      residual_mean_of_squares(x_test$rings, predict_sqrt_log_to_rings(lm, x_test, ancova = ancova))
    )
}

Parts I & II: EDA and Model validation & ANOVA/ANCOVA

Dataset description

str(abalone[, init_feat])
## 'data.frame':    4174 obs. of  9 variables:
##  $ sex           : Factor w/ 3 levels "F","I","M": 3 1 3 2 2 1 1 3 1 1 ...
##  $ length        : int  70 106 88 66 85 106 109 95 110 105 ...
##  $ diameter      : int  53 84 73 51 60 83 85 74 88 76 ...
##  $ height        : int  18 27 25 16 19 30 25 25 30 28 ...
##  $ whole_weight  : num  45.1 135.4 103.2 41 70.3 ...
##  $ shucked_weight: num  19.9 51.3 43.1 17.9 28.2 47.4 58.8 43.3 62.9 38.8 ...
##  $ viscera_weight: num  9.7 28.3 22.8 7.9 15.5 28.3 29.9 22.5 30.2 29.5 ...
##  $ shell_weight  : num  14 42 31 11 24 66 52 33 64 42 ...
##  $ rings         : int  7 9 10 7 8 20 16 9 19 14 ...

In the Abalone dataset, we have the following variables:

  • sex: factor corresponding to the sex of the snail, which can be male (M), female (F) and infant (I)
  • length: integer corresponding to the length of the shell
  • diameter: integer corresponding to the diameter of the shell, perpendicular to length
  • height: integer corresponding to the height of the meat inside the shell
  • whole_weight: double corresponding to the weight of the whole abalone
  • shucked_weight: double corresponding to the weight of the meat inside the shell
  • viscera_weight: double corresponding to the weight of the gut after bleeding
  • shell_weight: double corresponding to the weight of the shell alone
  • rings: integer corresponding to the number of rings on the shell, +1.5 gives the age of the abalone in years
summary(abalone[, init_feat])
##  sex          length         diameter          height        whole_weight   
##  F:1307   Min.   : 15.0   Min.   : 11.00   Min.   :  2.00   Min.   :  0.40  
##  I:1340   1st Qu.: 90.0   1st Qu.: 70.00   1st Qu.: 23.00   1st Qu.: 88.42  
##  M:1527   Median :109.0   Median : 85.00   Median : 28.00   Median :160.00  
##           Mean   :104.8   Mean   : 81.59   Mean   : 27.92   Mean   :165.82  
##           3rd Qu.:123.0   3rd Qu.: 96.00   3rd Qu.: 33.00   3rd Qu.:230.75  
##           Max.   :163.0   Max.   :130.00   Max.   :226.00   Max.   :565.10  
##  shucked_weight   viscera_weight    shell_weight        rings       
##  Min.   :  0.20   Min.   :  0.10   Min.   :  0.30   Min.   : 1.000  
##  1st Qu.: 37.23   1st Qu.: 18.70   1st Qu.: 26.00   1st Qu.: 8.000  
##  Median : 67.20   Median : 34.20   Median : 46.80   Median : 9.000  
##  Mean   : 71.90   Mean   : 36.13   Mean   : 47.77   Mean   : 9.934  
##  3rd Qu.:100.40   3rd Qu.: 50.60   3rd Qu.: 65.78   3rd Qu.:11.000  
##  Max.   :297.60   Max.   :152.00   Max.   :201.00   Max.   :29.000

From this summary, we can deduce that the data points that we have are evenly distributed among the sex variable. Regarding the dependent variable rings, we observe that values range from 1 to 29, which correspond to ages from 2.5 to 30.5 years. Median and mean are relatively close, with a small skew for the dependent variable.

# Correlation and distribution plots
ggpairs(
    abalone[, init_feat],
    aes(color = sex, alpha = 1),
    title = "Scatterplot and correlation for abalone dataset",
    lower = list(combo = wrap("facethist", binwidth = 5)),
    upper = list(continuous = wrap(ggally_cor, size = 2))
) +
theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = -90, vjust = 0.5),
    strip.text.x = element_text(size = 6),
    strip.text.y = element_text(size = 5)
)

We recall from the introduction that biologists believe there is a linear dependence between the height of abalones and their age. From the last line of the plot, we observe that there is indeed a correlation between large height values and large numbers of rings, however a simple linear dependence does not seem clear.

Secondly, there are very high correlations between the different explicative variables. For instance, there is a correlation of 0.987 between diameter and length, which can make the explanation of our models more difficult.

In addition, the separation between infants and adults is clear in almost all plots. However, distributions of variables for both male and female abalones are very similar. This indicates that the major difference is between infants and adults and might be a better variable for modeling.

Simple linear model

The simple linear model can be described as follows, where \(Y\) is associated to rings and \(X\) is associated to height and intercept: \(Y = X \beta + \epsilon\). Under the normal error assumption, we have to check if the following properties are indeed verified:

[P1] Errors are centered: \(\forall i = 1..n, \mathbb{E}_{\beta}(\epsilon_i) = 0\).

Possible assessment:

  • The mean curve in the Residuals vs. Fitted plot should be close to zero and straight

[P2] Errors have homoscedastic variance: \(\forall i = 1..n, \text{Var}_{\beta}(\epsilon_i) = \sigma^2\).

Possible assessments:

  • The Scale-Location plot shows the repartition of residuals among observations, which should be uniform

  • The Breush-Pagan test allows to assess the \(\mathcal{H}_0\) hypothesis of homoscedasticity, which is rejected if the \(p\)-value is smaller than 0.05

In particular, a square or log transformation of the dependent variable \(Y\) might improve the model in case the homoscedastic assumption is rejected.

[P3] Errors are uncorrelated: \(\forall i \neq j, \text{Cov}(\epsilon_i, \epsilon_j) = 0\).

Possible assessments:

  • The auto-correlation function should not exceed the confidence interval around 0

  • The Durbin-Watson test allows to assess the \(\mathcal{H}_0\) hypothesis of uncorrelation, which is rejected if the \(p\)-value is smaller than 0.05

[P4] Errors are gaussian: \(\forall i = 1..n, \epsilon_i \hookrightarrow \mathbb{N}(0, \sigma^2)\).

Possible assessments:

  • The Q-Q plot shows the comparison between the quantiles of the standardized residuals and a true normal distribution, which should be close enough

  • The Shapiro-Wilk test allows to assess the \(\mathcal{H}_0\) hypothesis of gaussianity, which is rejected if the \(p\)-value is smaller than 0.05

# Simple linear model using only height
lm_height <- lm(rings ~ height, data = abalone_train)

# Remove outliers
abalone_train$cook_dist <- cooks.distance(lm_height)
abalone_train <- subset(abalone_train, cook_dist < 0.1)
lm_height <- lm(rings ~ height, data = abalone_train)
plot(
    abalone_train$height,
    abalone_train$rings,
    ylab = "Rings",
    xlab = "Height",
    pch = 20,
    cex = 0.8,
    type = "p",
    main = "Confidence intervals for the simple linear model"
) +
matlines(
    abalone_train$height,
    predict(lm_height, interval = "prediction", level = 0.95),
    lty = c(1, 2, 2),
    col = c("red", "green", "green")
)

## integer(0)

From the fitted line and scatter plot, we can deduce that the linear model might not be sufficient to describe the relationship between height and rings variables.

First, we assess the initial assumptions for the linear regression of rings using height only.

autoplot(lm_height)

[P1] It is clear that, for higher values of rings, the residuals are negative in average. This shows that the model predicts too high values and indicates for a transformation of the dependent variable.

[P2] Below, we show the Breush-Pagan test’s \(p\)-value. The \(p\)-value is 2.22e-16, therefore we can reject \(\mathcal{H}_0\). In addition, we see from the plot that the residuals are clearly not equally spread.

# Breush-Pagan test
ncvTest(lm_height)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 230.414, Df = 1, p = < 2.22e-16

[P3] Below, we show the \(auto-correlation\) plot as well as the Durbin-Watson test’s \(p\)-value. The \(p\)-value is negligeable, therefore we can reject \(\mathcal{H}_0\). Moreover, the auto-correlation function of residuals is clearly not close to zero.

# Auto-correlation function
acf(lm_height$residuals, main = "Auto-correlation function of residuals")

# Durbin-Watson test
durbinWatsonTest(lm_height)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.4600701      1.079837       0
##  Alternative hypothesis: rho != 0

[P4] Below, we show the Shapiro-Wilk test’s \(p\)-value. The \(p\)-value is negligeable, therefore we can reject \(\mathcal{H}_0\). In addition, the plot shows that there is a clear deviation from a normal distribution for higher quantiles.

# Shapiro-Wilk test
shapiro.test(lm_height$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_height$residuals
## W = 0.88346, p-value < 2.2e-16

All in all, from our assumptions, only [P1] seems to be verified to a certain extent. All [P2], [P3] and [P4] are not verified. From our observations, a transformation of the dependent variable \(Y\) might improve our results. The fact that the predictions made by our model are larger than actual values, we will thus try the following transformation: \(\log(Y)\). The model is now: \(\log(Y) = X \beta + \epsilon\).

Linear model w. log transformation of \(Y\)

# Linear model using only height w. log transformation
lm_log_height <- lm(log_rings ~ height, data = abalone_train)

Finally, we perform our tests again, with our modified model.

autoplot(lm_log_height)

[P1] The results are not perfect, with an inversed U-shape. This might indicate a relationship with square of height.

[P2] Below, we show the Breush-Pagan test’s \(p\)-value. The plot is much better than for the simple linear model, with a red curve close to 1. Also, the \(p\)-value is 0.72724, which is larger than 0.05 and \(\mathcal{H}_0\) cannot be rejected.

# Breush-Pagan test
ncvTest(lm_log_height)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.4301489, Df = 1, p = 0.51192

[P3] Below, we show the \(auto-correlation\) plot as well as the Durbin-Watson test’s \(p\)-value. The auto-correlation is very similar to the simple linear model.

# Auto-correlation function
acf(lm_log_height$residuals, main = "Auto-correlation function of residuals")

# Durbin-Watson test
durbinWatsonTest(lm_log_height)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.4543776      1.091239       0
##  Alternative hypothesis: rho != 0

[P4] Below, we show the Shapiro-Wilk test’s \(p\)-value. The plot is much closer to the normal distribution with our log transformation of the dependent variable. However, the \(p\)-value of the test is still smaller than 0.05.

# Shapiro-Wilk test
shapiro.test(lm_log_height$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_log_height$residuals
## W = 0.97288, p-value < 2.2e-16

Therefore, we see that [P2] and [P4] are now verified, at least to a certain extent. With our new model, [P3] is still not verified.

We will now try out the following model: \(\log(Y) = X \beta + \epsilon\) where \(X\) contains the square of height plus height.

# Linear model using only height w. log transformation and square of height
lm_log_height2 <- lm(log_rings ~ height + height2, data = abalone_train)
autoplot(lm_log_height2)

[P1] The expectation of residuals is getting closer to zero again.

[P2] Below, we show the Breush-Pagan test’s \(p\)-value. Results are slightly better with this model.

# Breush-Pagan test
ncvTest(lm_log_height2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 1.963193, Df = 1, p = 0.16117

[P3] Below, we show the \(auto-correlation\) plot as well as the Durbin-Watson test’s \(p\)-value. Results are very similar to the model containing only the height.

# Auto-correlation function
acf(lm_log_height2$residuals, main = "Auto-correlation function of residuals")

# Durbin-Watson test
durbinWatsonTest(lm_log_height2)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.4687026      1.062583       0
##  Alternative hypothesis: rho != 0

[P4] Below, we show the Shapiro-Wilk test’s \(p\)-value. Again, results are very similar here, with a distance between true and predicted quantiles getting larger at the extremes.

# Shapiro-Wilk test
shapiro.test(lm_log_height2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_log_height2$residuals
## W = 0.96668, p-value < 2.2e-16

Regarding the relationship between rings and height, our simple model will be between the logarithm of rings and both height and squared height.

summary(lm_log_height2)
## 
## Call:
## lm(formula = log_rings ~ height + height2, data = abalone_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66632 -0.15549 -0.03705  0.12197  0.83776 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.917e-01  3.947e-02   25.12   <2e-16 ***
## height       6.658e-02  2.961e-03   22.48   <2e-16 ***
## height2     -7.179e-04  5.375e-05  -13.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2227 on 2917 degrees of freedom
## Multiple R-squared:  0.4914, Adjusted R-squared:  0.4911 
## F-statistic:  1409 on 2 and 2917 DF,  p-value: < 2.2e-16

We can finally compare our different models using ANOVA.

From the summary of our linear model, we observe that the \(p\)-value associated to the intercept, height and squared height are close to zero, thus we can say that there is a statistically significant relationship between log(rings) and height, squared height.

lm_log_intercept <- lm(log_rings ~ 1, data = abalone_train)

anova(lm_log_intercept, lm_log_height, lm_log_height2)
Res.Df RSS Df Sum of Sq F Pr(>F)
2919 284.4350 NA NA NA NA
2918 153.4985 1 130.93649 2640.3915 0
2917 144.6534 1 8.84504 178.3641 0
predict_log_error(lm_log_intercept, abalone_test)
## [1] 11.94409
predict_log_error(lm_log_height, abalone_test)
## [1] 7.277157
predict_log_error(lm_log_height2, abalone_test)
## [1] 6.901757

The ANOVA confirms that our linear model is better than the simple intercept value. Then, we want to add new features to our linear model, without losing too much accordance to the assumptions.

Multiple linear model

Below, we show a graph gathering different transformations of our features leading to a particularly good fit between features and \(log(**rings**)\).

# Correlation and distribution plots w. log transformation
log_cols <- c(
    "sex", "log_length", "log_diameter", "height", "height2", "log_whole_weight",
    "log_shucked_weight", "log_viscera_weight", "log_shell_weight",
    "log_rings"
)

ggpairs(
    abalone[, log_cols],
    aes(color = sex, alpha = 1),
    title = "Scatterplot and correlation for abalone dataset - log transf.",
    lower = list(combo = wrap("facethist", binwidth = 5)),
    upper = list(continuous = wrap(ggally_cor, size = 2))
) +
theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = -90, vjust = 0.5),
    strip.text.x = element_text(size = 6),
    strip.text.y = element_text(size = 5)
)

# Add log of some of the features
lm_log_multi <- lm(
    log_rings ~ log_length + height + height2 + log_shucked_weight + log_whole_weight +
    log_shell_weight + adulthood,
    data = abalone_train
)

# Remove outliers
abalone_train$cook_dist_multi <- cooks.distance(lm_log_multi)
abalone_train_multi <- subset(abalone_train, cook_dist_multi < 0.1)
lm_log_multi <- lm(
    log_rings ~ log_length + height + height2 + log_shucked_weight + log_whole_weight +
    log_shell_weight + adulthood,
    data = abalone_train_multi
)
autoplot(lm_log_multi)

[P1] The residuals are not equally distributed, but the mean is close to zero with a small U-shape.

[P2] Standardized residuals are equally distributed but the line is not straight. The \(p\)-value indicates that \(\mathcal{H}_0\) is rejected.

# Breush-Pagan test
ncvTest(lm_log_multi)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 18.01005, Df = 1, p = 2.1974e-05

[P3] Below, we show the \(auto-correlation\) plot as well as the Durbin-Watson test’s \(p\)-value. In terms of autocorrelation, the values are largely smaller than for our simple linear model but still above the confidence interval.

# Auto-correlation function
acf(lm_log_multi$residuals, main = "Auto-correlation function of residuals")

# Durbin-Watson test
durbinWatsonTest(lm_log_multi)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2777612      1.444462       0
##  Alternative hypothesis: rho != 0

[P4] Below, we show the Shapiro-Wilk test’s \(p\)-value. Again, results are very similar here, with a distance between true and predicted quantiles getting larger at the extremes.

# Shapiro-Wilk test
shapiro.test(lm_log_multi$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_log_multi$residuals
## W = 0.98798, p-value = 5.228e-15
summary(lm_log_multi)
## 
## Call:
## lm(formula = log_rings ~ log_length + height + height2 + log_shucked_weight + 
##     log_whole_weight + log_shell_weight + adulthood, data = abalone_train_multi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97466 -0.12543 -0.01457  0.10994  0.73062 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.887e+00  2.644e-01   7.135 1.22e-12 ***
## log_length         -3.974e-01  8.055e-02  -4.933 8.55e-07 ***
## height              1.783e-02  4.339e-03   4.110 4.07e-05 ***
## height2            -2.438e-04  6.099e-05  -3.998 6.55e-05 ***
## log_shucked_weight -6.568e-01  3.373e-02 -19.470  < 2e-16 ***
## log_whole_weight    6.700e-01  5.788e-02  11.576  < 2e-16 ***
## log_shell_weight    3.463e-01  3.099e-02  11.174  < 2e-16 ***
## adulthood           3.586e-02  9.345e-03   3.838 0.000127 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1871 on 2909 degrees of freedom
## Multiple R-squared:  0.6411, Adjusted R-squared:  0.6403 
## F-statistic: 742.4 on 7 and 2909 DF,  p-value: < 2.2e-16
predict_log_error(lm_log_multi, abalone_test)
## [1] 4.935304

Finally, all the coefficients are significant in this model, and the final RSS on test data is below 5, which is way better than our simple linear models. Recall that for these models, the RSS was approx. 7.

Forward feature selection

In this section, we perform a forward feature selection. We use a model consisting of all known features and interaction terms with the adulthood feature. Then, the model performs a selection of the features based on results on the training set. The final best model is summed up below.

# Forward selection start from simplest model
abalone_train_copy <- data.frame(abalone_train)
lm_log_intercept <- lm(log_rings ~ 1, data = abalone_train_copy)

lm_complete <- lm(
    log_rings ~ adulthood + length + log_length + length * adulthood + log_length * adulthood +
    diameter + log_diameter + diameter * adulthood + log_diameter * adulthood + height + height2 +
    log_height + height * adulthood + height2 * adulthood + log_height * adulthood + shucked_weight
    + log_shucked_weight + shucked_weight * adulthood + log_shucked_weight * adulthood +
    viscera_weight + log_viscera_weight + viscera_weight * adulthood + log_viscera_weight *
    adulthood + whole_weight + log_whole_weight + whole_weight * adulthood + log_whole_weight *
    adulthood + shell_weight + log_shell_weight + shell_weight * adulthood + log_shell_weight *
    adulthood,
    data = abalone_train_copy
)

lm_fwd <- stepAIC(
    lm_log_intercept,
    scope = list(upper = lm_complete, lower = lm_log_intercept),
    trace = T,
    direction = c("forward"),
    data = abalone_train_copy
)
## Start:  AIC=-6798.2
## log_rings ~ 1
## 
##                      Df Sum of Sq    RSS     AIC
## + log_shell_weight    1   152.723 131.71 -9044.3
## + log_height          1   139.364 145.07 -8762.2
## + log_whole_weight    1   134.361 150.07 -8663.2
## + log_diameter        1   132.112 152.32 -8619.7
## + height              1   130.936 153.50 -8597.3
## + log_viscera_weight  1   127.612 156.82 -8534.7
## + log_length          1   125.889 158.55 -8502.8
## + shell_weight        1   124.962 159.47 -8485.8
## + diameter            1   123.684 160.75 -8462.5
## + length              1   117.330 167.10 -8349.3
## + height2             1   114.713 169.72 -8303.9
## + log_shucked_weight  1   109.053 175.38 -8208.1
## + whole_weight        1    99.243 185.19 -8049.2
## + viscera_weight      1    88.616 195.82 -7886.3
## + adulthood           1    68.089 216.35 -7595.2
## + shucked_weight      1    67.552 216.88 -7587.9
## <none>                            284.44 -6798.2
## 
## Step:  AIC=-9044.26
## log_rings ~ log_shell_weight
## 
##                      Df Sum of Sq    RSS     AIC
## + log_shucked_weight  1   21.5792 110.13 -9564.7
## + log_length          1   10.9514 120.76 -9295.7
## + shucked_weight      1   10.1196 121.59 -9275.7
## + log_whole_weight    1    9.8514 121.86 -9269.3
## + length              1    9.3369 122.38 -9257.0
## + log_diameter        1    6.7397 124.97 -9195.6
## + diameter            1    5.3552 126.36 -9163.5
## + log_viscera_weight  1    4.5812 127.13 -9145.6
## + viscera_weight      1    2.8648 128.85 -9106.5
## + whole_weight        1    2.2548 129.46 -9092.7
## + adulthood           1    2.1347 129.58 -9090.0
## + height2             1    0.2533 131.46 -9047.9
## + height              1    0.2378 131.47 -9047.5
## + shell_weight        1    0.1972 131.51 -9046.6
## + log_height          1    0.1082 131.60 -9044.7
## <none>                            131.71 -9044.3
## 
## Step:  AIC=-9564.74
## log_rings ~ log_shell_weight + log_shucked_weight
## 
##                      Df Sum of Sq    RSS     AIC
## + log_whole_weight    1    3.6097 106.52 -9660.0
## + adulthood           1    1.7858 108.35 -9610.5
## + log_height          1    1.0469 109.09 -9590.6
## + height              1    0.7046 109.43 -9581.5
## + length              1    0.4933 109.64 -9575.8
## + height2             1    0.3997 109.73 -9573.4
## + log_viscera_weight  1    0.2889 109.84 -9570.4
## + shucked_weight      1    0.2836 109.85 -9570.3
## + log_length          1    0.1921 109.94 -9567.8
## <none>                            110.13 -9564.7
## + viscera_weight      1    0.0662 110.07 -9564.5
## + diameter            1    0.0588 110.07 -9564.3
## + shell_weight        1    0.0338 110.10 -9563.6
## + log_diameter        1    0.0245 110.11 -9563.4
## + whole_weight        1    0.0007 110.13 -9562.8
## 
## Step:  AIC=-9660.05
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight
## 
##                      Df Sum of Sq    RSS     AIC
## + length              1   1.41177 105.11 -9697.0
## + adulthood           1   0.99475 105.53 -9685.4
## + viscera_weight      1   0.94088 105.58 -9684.0
## + log_length          1   0.74525 105.78 -9678.5
## + log_height          1   0.54779 105.98 -9673.1
## + shucked_weight      1   0.50435 106.02 -9671.9
## + diameter            1   0.44718 106.08 -9670.3
## + whole_weight        1   0.30302 106.22 -9666.4
## + height              1   0.23400 106.29 -9664.5
## + log_viscera_weight  1   0.22423 106.30 -9664.2
## + shell_weight        1   0.10697 106.42 -9661.0
## + height2             1   0.09523 106.43 -9660.7
## <none>                            106.52 -9660.0
## + log_diameter        1   0.03884 106.48 -9659.1
## 
## Step:  AIC=-9697
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length
## 
##                      Df Sum of Sq    RSS     AIC
## + adulthood           1   1.09538 104.02 -9725.6
## + height              1   0.65872 104.45 -9713.4
## + height2             1   0.56666 104.55 -9710.8
## + log_height          1   0.44716 104.66 -9707.5
## + diameter            1   0.21215 104.90 -9700.9
## + shell_weight        1   0.13692 104.97 -9698.8
## + viscera_weight      1   0.12519 104.99 -9698.5
## + log_viscera_weight  1   0.12504 104.99 -9698.5
## + log_diameter        1   0.09556 105.02 -9697.7
## <none>                            105.11 -9697.0
## + whole_weight        1   0.03015 105.08 -9695.8
## + log_length          1   0.00922 105.10 -9695.3
## + shucked_weight      1   0.00139 105.11 -9695.0
## 
## Step:  AIC=-9725.59
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood
## 
##                                Df Sum of Sq    RSS     AIC
## + adulthood:log_shucked_weight  1   0.63004 103.39 -9741.3
## + height                        1   0.51278 103.50 -9738.0
## + height2                       1   0.43664 103.58 -9735.9
## + adulthood:length              1   0.39871 103.62 -9734.8
## + log_height                    1   0.38709 103.63 -9734.5
## + viscera_weight                1   0.27270 103.74 -9731.3
## + adulthood:log_whole_weight    1   0.22499 103.79 -9729.9
## + log_viscera_weight            1   0.20233 103.81 -9729.3
## + log_diameter                  1   0.14683 103.87 -9727.7
## + diameter                      1   0.12452 103.89 -9727.1
## + adulthood:log_shell_weight    1   0.10560 103.91 -9726.6
## + shell_weight                  1   0.07501 103.94 -9725.7
## <none>                                      104.02 -9725.6
## + shucked_weight                1   0.04322 103.97 -9724.8
## + log_length                    1   0.01091 104.00 -9723.9
## + whole_weight                  1   0.00165 104.02 -9723.6
## 
## Step:  AIC=-9741.33
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + log_shucked_weight:adulthood
## 
##                              Df Sum of Sq    RSS     AIC
## + adulthood:log_whole_weight  1   2.26747 101.12 -9804.1
## + adulthood:log_shell_weight  1   1.45657 101.93 -9780.8
## + height2                     1   0.89060 102.50 -9764.6
## + height                      1   0.81884 102.57 -9762.6
## + shell_weight                1   0.59662 102.79 -9756.2
## + log_height                  1   0.35525 103.03 -9749.4
## + whole_weight                1   0.29528 103.09 -9747.7
## + diameter                    1   0.25492 103.13 -9746.5
## + log_viscera_weight          1   0.24456 103.14 -9746.2
## + log_length                  1   0.11767 103.27 -9742.7
## + adulthood:length            1   0.09123 103.30 -9741.9
## <none>                                    103.39 -9741.3
## + shucked_weight              1   0.05129 103.33 -9740.8
## + viscera_weight              1   0.04132 103.34 -9740.5
## + log_diameter                1   0.02601 103.36 -9740.1
## 
## Step:  AIC=-9804.09
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + log_shucked_weight:adulthood + log_whole_weight:adulthood
## 
##                              Df Sum of Sq    RSS     AIC
## + height                      1   0.62002 100.50 -9820.0
## + height2                     1   0.58281 100.54 -9819.0
## + adulthood:length            1   0.36334 100.75 -9812.6
## + log_height                  1   0.36034 100.76 -9812.5
## + log_viscera_weight          1   0.27581 100.84 -9810.1
## + viscera_weight              1   0.20701 100.91 -9808.1
## + diameter                    1   0.20124 100.92 -9807.9
## + shell_weight                1   0.16489 100.95 -9806.9
## + shucked_weight              1   0.10161 101.02 -9805.0
## + whole_weight                1   0.08502 101.03 -9804.5
## + log_diameter                1   0.07684 101.04 -9804.3
## <none>                                    101.12 -9804.1
## + log_length                  1   0.02451 101.09 -9802.8
## + adulthood:log_shell_weight  1   0.00260 101.12 -9802.2
## 
## Step:  AIC=-9820.05
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + log_shucked_weight:adulthood + 
##     log_whole_weight:adulthood
## 
##                              Df Sum of Sq     RSS     AIC
## + viscera_weight              1   0.55114  99.948 -9834.1
## + adulthood:height            1   0.47191 100.027 -9831.8
## + adulthood:length            1   0.33293 100.166 -9827.7
## + log_viscera_weight          1   0.32411 100.175 -9827.5
## + log_diameter                1   0.16123 100.337 -9822.7
## + diameter                    1   0.12667 100.372 -9821.7
## <none>                                    100.499 -9820.0
## + log_height                  1   0.05052 100.448 -9819.5
## + shell_weight                1   0.02092 100.478 -9818.7
## + log_length                  1   0.01099 100.488 -9818.4
## + adulthood:log_shell_weight  1   0.00785 100.491 -9818.3
## + shucked_weight              1   0.00497 100.494 -9818.2
## + whole_weight                1   0.00044 100.498 -9818.1
## + height2                     1   0.00000 100.499 -9818.0
## 
## Step:  AIC=-9834.1
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + log_shucked_weight:adulthood + 
##     log_whole_weight:adulthood
## 
##                              Df Sum of Sq    RSS     AIC
## + adulthood:viscera_weight    1   0.51136 99.436 -9847.1
## + adulthood:height            1   0.49789 99.450 -9846.7
## + whole_weight                1   0.47663 99.471 -9846.1
## + shucked_weight              1   0.45771 99.490 -9845.5
## + shell_weight                1   0.36600 99.582 -9842.8
## + adulthood:length            1   0.29652 99.651 -9840.8
## + log_height                  1   0.27172 99.676 -9840.1
## + diameter                    1   0.17764 99.770 -9837.3
## + log_length                  1   0.11912 99.828 -9835.6
## + height2                     1   0.10661 99.841 -9835.2
## <none>                                    99.948 -9834.1
## + log_viscera_weight          1   0.06428 99.883 -9834.0
## + log_diameter                1   0.03673 99.911 -9833.2
## + adulthood:log_shell_weight  1   0.03133 99.916 -9833.0
## 
## Step:  AIC=-9847.08
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + log_shucked_weight:adulthood + 
##     log_whole_weight:adulthood + adulthood:viscera_weight
## 
##                              Df Sum of Sq    RSS     AIC
## + shucked_weight              1   0.39330 99.043 -9856.7
## + whole_weight                1   0.37571 99.060 -9856.1
## + shell_weight                1   0.24664 99.190 -9852.3
## + adulthood:height            1   0.19693 99.239 -9850.9
## + diameter                    1   0.13053 99.306 -9848.9
## + log_height                  1   0.11222 99.324 -9848.4
## + height2                     1   0.10530 99.331 -9848.2
## + log_viscera_weight          1   0.10305 99.333 -9848.1
## + log_diameter                1   0.08902 99.347 -9847.7
## <none>                                    99.436 -9847.1
## + adulthood:log_shell_weight  1   0.06527 99.371 -9847.0
## + adulthood:length            1   0.05024 99.386 -9846.6
## + log_length                  1   0.00814 99.428 -9845.3
## 
## Step:  AIC=-9856.65
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_shucked_weight:adulthood + log_whole_weight:adulthood + 
##     adulthood:viscera_weight
## 
##                              Df Sum of Sq    RSS     AIC
## + log_diameter                1  0.240093 98.803 -9861.7
## + adulthood:height            1  0.218821 98.824 -9861.1
## + log_length                  1  0.137679 98.905 -9858.7
## + adulthood:log_shell_weight  1  0.092864 98.950 -9857.4
## + diameter                    1  0.087004 98.956 -9857.2
## <none>                                    99.043 -9856.7
## + adulthood:length            1  0.063794 98.979 -9856.5
## + shell_weight                1  0.015872 99.027 -9855.1
## + whole_weight                1  0.013325 99.030 -9855.0
## + adulthood:shucked_weight    1  0.009827 99.033 -9854.9
## + log_height                  1  0.008848 99.034 -9854.9
## + log_viscera_weight          1  0.003097 99.040 -9854.7
## + height2                     1  0.002985 99.040 -9854.7
## 
## Step:  AIC=-9861.74
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_diameter + log_shucked_weight:adulthood + log_whole_weight:adulthood + 
##     adulthood:viscera_weight
## 
##                              Df Sum of Sq    RSS     AIC
## + adulthood:height            1  0.206156 98.597 -9865.8
## + log_length                  1  0.084115 98.719 -9862.2
## + adulthood:log_shell_weight  1  0.073655 98.729 -9861.9
## + diameter                    1  0.069657 98.733 -9861.8
## <none>                                    98.803 -9861.7
## + adulthood:length            1  0.048769 98.754 -9861.2
## + shell_weight                1  0.045638 98.757 -9861.1
## + whole_weight                1  0.032058 98.771 -9860.7
## + adulthood:log_diameter      1  0.020706 98.782 -9860.4
## + log_height                  1  0.017740 98.785 -9860.3
## + height2                     1  0.005498 98.797 -9859.9
## + adulthood:shucked_weight    1  0.003891 98.799 -9859.9
## + log_viscera_weight          1  0.000292 98.803 -9859.7
## 
## Step:  AIC=-9865.84
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_diameter + log_shucked_weight:adulthood + log_whole_weight:adulthood + 
##     adulthood:viscera_weight + adulthood:height
## 
##                              Df Sum of Sq    RSS     AIC
## + height2                     1  0.106090 98.491 -9867.0
## + log_height                  1  0.105948 98.491 -9867.0
## + log_length                  1  0.097512 98.499 -9866.7
## + diameter                    1  0.072302 98.524 -9866.0
## + shell_weight                1  0.068065 98.529 -9865.9
## <none>                                    98.597 -9865.8
## + adulthood:length            1  0.040173 98.556 -9865.0
## + whole_weight                1  0.030510 98.566 -9864.7
## + adulthood:log_shell_weight  1  0.027505 98.569 -9864.7
## + adulthood:shucked_weight    1  0.013917 98.583 -9864.3
## + adulthood:log_diameter      1  0.013223 98.583 -9864.2
## + log_viscera_weight          1  0.000952 98.596 -9863.9
## 
## Step:  AIC=-9866.98
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_diameter + height2 + log_shucked_weight:adulthood + log_whole_weight:adulthood + 
##     adulthood:viscera_weight + adulthood:height
## 
##                              Df Sum of Sq    RSS     AIC
## + log_length                  1  0.121358 98.369 -9868.6
## + diameter                    1  0.104342 98.386 -9868.1
## <none>                                    98.491 -9867.0
## + adulthood:length            1  0.042168 98.448 -9866.2
## + adulthood:log_shell_weight  1  0.033987 98.457 -9866.0
## + shell_weight                1  0.020554 98.470 -9865.6
## + adulthood:shucked_weight    1  0.017564 98.473 -9865.5
## + log_height                  1  0.016410 98.474 -9865.5
## + adulthood:log_diameter      1  0.015865 98.475 -9865.5
## + log_viscera_weight          1  0.011706 98.479 -9865.3
## + adulthood:height2           1  0.006817 98.484 -9865.2
## + whole_weight                1  0.002102 98.488 -9865.0
## 
## Step:  AIC=-9868.58
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_diameter + height2 + log_length + log_shucked_weight:adulthood + 
##     log_whole_weight:adulthood + adulthood:viscera_weight + adulthood:height
## 
##                              Df Sum of Sq    RSS     AIC
## + shell_weight                1  0.092582 98.277 -9869.3
## <none>                                    98.369 -9868.6
## + log_height                  1  0.063535 98.306 -9868.5
## + whole_weight                1  0.030371 98.339 -9867.5
## + adulthood:log_shell_weight  1  0.019563 98.350 -9867.2
## + adulthood:height2           1  0.013355 98.356 -9867.0
## + diameter                    1  0.010651 98.359 -9866.9
## + adulthood:length            1  0.008610 98.361 -9866.8
## + adulthood:log_length        1  0.006724 98.362 -9866.8
## + adulthood:log_diameter      1  0.002353 98.367 -9866.7
## + adulthood:shucked_weight    1  0.002143 98.367 -9866.6
## + log_viscera_weight          1  0.000610 98.369 -9866.6
## 
## Step:  AIC=-9869.33
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_diameter + height2 + log_length + shell_weight + log_shucked_weight:adulthood + 
##     log_whole_weight:adulthood + adulthood:viscera_weight + adulthood:height
## 
##                              Df Sum of Sq    RSS     AIC
## + adulthood:shell_weight      1   0.41573 97.861 -9879.7
## + adulthood:log_shell_weight  1   0.08012 98.197 -9869.7
## + log_height                  1   0.07397 98.203 -9869.5
## <none>                                    98.277 -9869.3
## + diameter                    1   0.03428 98.242 -9868.4
## + adulthood:height2           1   0.01816 98.258 -9867.9
## + adulthood:length            1   0.00709 98.270 -9867.5
## + log_viscera_weight          1   0.00611 98.271 -9867.5
## + adulthood:log_diameter      1   0.00183 98.275 -9867.4
## + whole_weight                1   0.00105 98.276 -9867.4
## + adulthood:log_length        1   0.00091 98.276 -9867.4
## + adulthood:shucked_weight    1   0.00016 98.276 -9867.3
## 
## Step:  AIC=-9879.71
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_diameter + height2 + log_length + shell_weight + log_shucked_weight:adulthood + 
##     log_whole_weight:adulthood + adulthood:viscera_weight + adulthood:height + 
##     adulthood:shell_weight
## 
##                              Df Sum of Sq    RSS     AIC
## + adulthood:shucked_weight    1  0.136142 97.725 -9881.8
## <none>                                    97.861 -9879.7
## + diameter                    1  0.056737 97.804 -9879.4
## + log_viscera_weight          1  0.052624 97.808 -9879.3
## + adulthood:length            1  0.035295 97.826 -9878.8
## + adulthood:height2           1  0.027092 97.834 -9878.5
## + log_height                  1  0.022802 97.838 -9878.4
## + adulthood:log_length        1  0.012424 97.848 -9878.1
## + whole_weight                1  0.011433 97.849 -9878.1
## + adulthood:log_diameter      1  0.003469 97.857 -9877.8
## + adulthood:log_shell_weight  1  0.002907 97.858 -9877.8
## 
## Step:  AIC=-9881.78
## log_rings ~ log_shell_weight + log_shucked_weight + log_whole_weight + 
##     length + adulthood + height + viscera_weight + shucked_weight + 
##     log_diameter + height2 + log_length + shell_weight + log_shucked_weight:adulthood + 
##     log_whole_weight:adulthood + adulthood:viscera_weight + adulthood:height + 
##     adulthood:shell_weight + adulthood:shucked_weight
## 
##                              Df Sum of Sq    RSS     AIC
## <none>                                    97.725 -9881.8
## + adulthood:log_shell_weight  1  0.058082 97.667 -9881.5
## + log_height                  1  0.049722 97.675 -9881.3
## + whole_weight                1  0.043316 97.681 -9881.1
## + diameter                    1  0.039911 97.685 -9881.0
## + log_viscera_weight          1  0.032504 97.692 -9880.7
## + adulthood:log_length        1  0.022508 97.702 -9880.4
## + adulthood:length            1  0.021791 97.703 -9880.4
## + adulthood:log_diameter      1  0.012786 97.712 -9880.2
## + adulthood:height2           1  0.005870 97.719 -9880.0
summary(lm_fwd)
## 
## Call:
## lm(formula = log_rings ~ log_shell_weight + log_shucked_weight + 
##     log_whole_weight + length + adulthood + height + viscera_weight + 
##     shucked_weight + log_diameter + height2 + log_length + shell_weight + 
##     log_shucked_weight:adulthood + log_whole_weight:adulthood + 
##     adulthood:viscera_weight + adulthood:height + adulthood:shell_weight + 
##     adulthood:shucked_weight, data = abalone_train_copy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.02988 -0.12005 -0.01178  0.10326  0.70584 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -1.471e+00  7.329e-01  -2.007  0.04484 *  
## log_shell_weight              1.477e-01  4.962e-02   2.976  0.00295 ** 
## log_shucked_weight           -1.961e-01  6.428e-02  -3.052  0.00230 ** 
## log_whole_weight              1.288e-01  7.351e-02   1.753  0.07977 .  
## length                       -9.811e-03  2.232e-03  -4.396 1.14e-05 ***
## adulthood                    -8.179e-01  1.235e-01  -6.623 4.17e-11 ***
## height                        7.053e-03  5.938e-03   1.188  0.23498    
## viscera_weight               -1.725e-03  1.900e-03  -0.908  0.36404    
## shucked_weight               -9.193e-04  1.194e-03  -0.770  0.44157    
## log_diameter                  2.304e-01  8.970e-02   2.568  0.01026 *  
## height2                       5.367e-05  1.081e-04   0.496  0.61974    
## log_length                    6.234e-01  2.260e-01   2.758  0.00585 ** 
## shell_weight                  7.797e-03  1.788e-03   4.361 1.34e-05 ***
## log_shucked_weight:adulthood -7.252e-01  8.446e-02  -8.586  < 2e-16 ***
## log_whole_weight:adulthood    8.471e-01  9.114e-02   9.294  < 2e-16 ***
## adulthood:viscera_weight     -1.789e-03  1.970e-03  -0.908  0.36394    
## adulthood:height             -6.366e-03  3.625e-03  -1.756  0.07919 .  
## adulthood:shell_weight       -6.337e-03  1.566e-03  -4.047 5.32e-05 ***
## adulthood:shucked_weight      2.375e-03  1.182e-03   2.010  0.04449 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1835 on 2901 degrees of freedom
## Multiple R-squared:  0.6564, Adjusted R-squared:  0.6543 
## F-statistic: 307.9 on 18 and 2901 DF,  p-value: < 2.2e-16
predict_log_error(lm_fwd, abalone_test)
## [1] 4.847444

This method allowed us to diminish the RSS even further. Below, we present the best model in terms of RSS that we manually computed.

best_lm_log_multi <- lm(
    log_rings ~ length * adulthood + log_length + height + height * adulthood +
    shucked_weight * adulthood + log_shucked_weight * adulthood + viscera_weight +
    log_whole_weight * adulthood + whole_weight * adulthood + log_shell_weight,
    data = abalone_train
)

predict_log_error(best_lm_log_multi, abalone_test)
## [1] 4.758786
summary(best_lm_log_multi)
## 
## Call:
## lm(formula = log_rings ~ length * adulthood + log_length + height + 
##     height * adulthood + shucked_weight * adulthood + log_shucked_weight * 
##     adulthood + viscera_weight + log_whole_weight * adulthood + 
##     whole_weight * adulthood + log_shell_weight, data = abalone_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04217 -0.11950 -0.01285  0.10397  0.70897 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -4.261e-01  7.422e-01  -0.574 0.565951    
## length                       -7.462e-03  3.018e-03  -2.473 0.013471 *  
## adulthood                    -9.111e-01  1.496e-01  -6.091 1.27e-09 ***
## log_length                    5.556e-01  2.439e-01   2.278 0.022823 *  
## height                        1.042e-02  2.810e-03   3.708 0.000213 ***
## shucked_weight               -5.061e-03  2.033e-03  -2.490 0.012833 *  
## log_shucked_weight           -8.864e-02  8.212e-02  -1.079 0.280501    
## viscera_weight               -3.485e-03  6.223e-04  -5.600 2.34e-08 ***
## log_whole_weight             -5.013e-02  9.508e-02  -0.527 0.598061    
## whole_weight                  4.135e-03  1.008e-03   4.103 4.19e-05 ***
## log_shell_weight              2.766e-01  3.077e-02   8.988  < 2e-16 ***
## length:adulthood             -9.725e-05  1.675e-03  -0.058 0.953697    
## adulthood:height             -6.236e-03  3.093e-03  -2.016 0.043884 *  
## adulthood:shucked_weight      6.567e-03  2.171e-03   3.024 0.002512 ** 
## adulthood:log_shucked_weight -8.209e-01  1.115e-01  -7.362 2.35e-13 ***
## adulthood:log_whole_weight    9.429e-01  1.231e-01   7.663 2.46e-14 ***
## adulthood:whole_weight       -3.942e-03  1.034e-03  -3.813 0.000140 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1839 on 2903 degrees of freedom
## Multiple R-squared:  0.6547, Adjusted R-squared:  0.6528 
## F-statistic: 344.1 on 16 and 2903 DF,  p-value: < 2.2e-16
autoplot(best_lm_log_multi)

ncvTest(best_lm_log_multi)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 35.0798, Df = 1, p = 3.1647e-09
durbinWatsonTest(best_lm_log_multi)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2732433      1.453461       0
##  Alternative hypothesis: rho != 0
shapiro.test(best_lm_log_multi$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  best_lm_log_multi$residuals
## W = 0.98828, p-value = 8.632e-15
acf(best_lm_log_multi$residuals, main = "Auto-correlation function of residuals")

We can see that assumptions [P2], [P3] and [P4] are no longer satisfied.

Part III: Model Selection and Prediction

In this section, we compare our multiple linear model with some well-known machine learning models. The comparison is mainly done thanks to the RSS values.

Random Forest

Random Forest is a well-known classification and regression algorithm. It is based on the average of multiple decision trees which are built using randomly selected features and samples respectively at each tree and node.

# Random Forest with all initial features
set.seed(42)

rf_init <- randomForest(
    rings ~ .,
    data = abalone_train[, init_feat],
    importance = TRUE
)

summary(rf_init)
##                 Length Class  Mode     
## call               4   -none- call     
## type               1   -none- character
## predicted       2920   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       2920   -none- numeric  
## importance        16   -none- numeric  
## importanceSD       8   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               2920   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
predict_error(rf_init, abalone_test)
## [1] 5.095847
# Random Forest with all computed features
set.seed(42)

rf_complete <- randomForest(
    rings ~ adulthood + length + log_length + length * adulthood + log_length * adulthood +
    diameter + log_diameter + diameter * adulthood + log_diameter * adulthood + height + height2 +
    log_height + height * adulthood + height2 * adulthood + log_height * adulthood + shucked_weight
    + log_shucked_weight + shucked_weight * adulthood + log_shucked_weight * adulthood +
    viscera_weight + log_viscera_weight + viscera_weight * adulthood + log_viscera_weight *
    adulthood + whole_weight + log_whole_weight + whole_weight * adulthood + log_whole_weight *
    adulthood + shell_weight + log_shell_weight + shell_weight * adulthood + log_shell_weight *
    adulthood,
    data = abalone_train,
    importance = TRUE
)

summary(rf_complete)
##                 Length Class  Mode     
## call               4   -none- call     
## type               1   -none- character
## predicted       2920   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       2920   -none- numeric  
## importance        32   -none- numeric  
## importanceSD      16   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               2920   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
predict_error(rf_complete, abalone_test)
## [1] 5.169329
# Random Forest with selected features
set.seed(42)

rf_multi <- randomForest(
    rings ~ length * adulthood + log_length + height + height * adulthood +
    shucked_weight * adulthood + log_shucked_weight * adulthood + viscera_weight +
    log_whole_weight * adulthood + whole_weight * adulthood + log_shell_weight,
    data = abalone_train,
    importance = TRUE
)

summary(rf_multi)
##                 Length Class  Mode     
## call               4   -none- call     
## type               1   -none- character
## predicted       2920   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       2920   -none- numeric  
## importance        20   -none- numeric  
## importanceSD      10   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               2920   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
predict_error(rf_multi, abalone_test)
## [1] 4.966454
# Random Forest with selected features + grid search
gridsearch_rf <- expand.grid(
    .mtry = c(2:5)  # or sqrt(length(10))
)

params <- expand.grid(
    ntrees = c(500, 1000),
    nodesize = c(1, 5)
)

control_rf <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42),
    verboseIter = TRUE,
)

store_maxnode <- vector("list", nrow(params))

for (i in seq_len(nrow(params))) {

    nodesize <- params[i, 2]
    ntree <- params[i, 1]

    set.seed(42)

    rf_grid <- train(
        rings ~ length * adulthood + log_length + height + height * adulthood +
        shucked_weight * adulthood + log_shucked_weight * adulthood + viscera_weight +
        log_whole_weight * adulthood + whole_weight * adulthood + log_shell_weight,
        data = abalone_train,
        method = "rf",
        importance = TRUE,
        tuneGrid = gridsearch_rf,
        trControl = control_rf,
        ntree = ntree,
        nodesize = nodesize,
        verbosity = 0
    )

    store_maxnode[[i]] <- rf_grid

    print(predict_error(rf_grid, abalone_test))

}
## + Fold1: mtry=2 
## - Fold1: mtry=2 
## + Fold1: mtry=3 
## - Fold1: mtry=3 
## + Fold1: mtry=4 
## - Fold1: mtry=4 
## + Fold1: mtry=5 
## - Fold1: mtry=5 
## + Fold2: mtry=2 
## - Fold2: mtry=2 
## + Fold2: mtry=3 
## - Fold2: mtry=3 
## + Fold2: mtry=4 
## - Fold2: mtry=4 
## + Fold2: mtry=5 
## - Fold2: mtry=5 
## + Fold3: mtry=2 
## - Fold3: mtry=2 
## + Fold3: mtry=3 
## - Fold3: mtry=3 
## + Fold3: mtry=4 
## - Fold3: mtry=4 
## + Fold3: mtry=5 
## - Fold3: mtry=5 
## + Fold4: mtry=2 
## - Fold4: mtry=2 
## + Fold4: mtry=3 
## - Fold4: mtry=3 
## + Fold4: mtry=4 
## - Fold4: mtry=4 
## + Fold4: mtry=5 
## - Fold4: mtry=5 
## + Fold5: mtry=2 
## - Fold5: mtry=2 
## + Fold5: mtry=3 
## - Fold5: mtry=3 
## + Fold5: mtry=4 
## - Fold5: mtry=4 
## + Fold5: mtry=5 
## - Fold5: mtry=5 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 5 on full training set
## [1] 5.088658
## + Fold1: mtry=2 
## - Fold1: mtry=2 
## + Fold1: mtry=3 
## - Fold1: mtry=3 
## + Fold1: mtry=4 
## - Fold1: mtry=4 
## + Fold1: mtry=5 
## - Fold1: mtry=5 
## + Fold2: mtry=2 
## - Fold2: mtry=2 
## + Fold2: mtry=3 
## - Fold2: mtry=3 
## + Fold2: mtry=4 
## - Fold2: mtry=4 
## + Fold2: mtry=5 
## - Fold2: mtry=5 
## + Fold3: mtry=2 
## - Fold3: mtry=2 
## + Fold3: mtry=3 
## - Fold3: mtry=3 
## + Fold3: mtry=4 
## - Fold3: mtry=4 
## + Fold3: mtry=5 
## - Fold3: mtry=5 
## + Fold4: mtry=2 
## - Fold4: mtry=2 
## + Fold4: mtry=3 
## - Fold4: mtry=3 
## + Fold4: mtry=4 
## - Fold4: mtry=4 
## + Fold4: mtry=5 
## - Fold4: mtry=5 
## + Fold5: mtry=2 
## - Fold5: mtry=2 
## + Fold5: mtry=3 
## - Fold5: mtry=3 
## + Fold5: mtry=4 
## - Fold5: mtry=4 
## + Fold5: mtry=5 
## - Fold5: mtry=5 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 5 on full training set
## [1] 5.065495
## + Fold1: mtry=2 
## - Fold1: mtry=2 
## + Fold1: mtry=3 
## - Fold1: mtry=3 
## + Fold1: mtry=4 
## - Fold1: mtry=4 
## + Fold1: mtry=5 
## - Fold1: mtry=5 
## + Fold2: mtry=2 
## - Fold2: mtry=2 
## + Fold2: mtry=3 
## - Fold2: mtry=3 
## + Fold2: mtry=4 
## - Fold2: mtry=4 
## + Fold2: mtry=5 
## - Fold2: mtry=5 
## + Fold3: mtry=2 
## - Fold3: mtry=2 
## + Fold3: mtry=3 
## - Fold3: mtry=3 
## + Fold3: mtry=4 
## - Fold3: mtry=4 
## + Fold3: mtry=5 
## - Fold3: mtry=5 
## + Fold4: mtry=2 
## - Fold4: mtry=2 
## + Fold4: mtry=3 
## - Fold4: mtry=3 
## + Fold4: mtry=4 
## - Fold4: mtry=4 
## + Fold4: mtry=5 
## - Fold4: mtry=5 
## + Fold5: mtry=2 
## - Fold5: mtry=2 
## + Fold5: mtry=3 
## - Fold5: mtry=3 
## + Fold5: mtry=4 
## - Fold5: mtry=4 
## + Fold5: mtry=5 
## - Fold5: mtry=5 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 3 on full training set
## [1] 5.049521
## + Fold1: mtry=2 
## - Fold1: mtry=2 
## + Fold1: mtry=3 
## - Fold1: mtry=3 
## + Fold1: mtry=4 
## - Fold1: mtry=4 
## + Fold1: mtry=5 
## - Fold1: mtry=5 
## + Fold2: mtry=2 
## - Fold2: mtry=2 
## + Fold2: mtry=3 
## - Fold2: mtry=3 
## + Fold2: mtry=4 
## - Fold2: mtry=4 
## + Fold2: mtry=5 
## - Fold2: mtry=5 
## + Fold3: mtry=2 
## - Fold3: mtry=2 
## + Fold3: mtry=3 
## - Fold3: mtry=3 
## + Fold3: mtry=4 
## - Fold3: mtry=4 
## + Fold3: mtry=5 
## - Fold3: mtry=5 
## + Fold4: mtry=2 
## - Fold4: mtry=2 
## + Fold4: mtry=3 
## - Fold4: mtry=3 
## + Fold4: mtry=4 
## - Fold4: mtry=4 
## + Fold4: mtry=5 
## - Fold4: mtry=5 
## + Fold5: mtry=2 
## - Fold5: mtry=2 
## + Fold5: mtry=3 
## - Fold5: mtry=3 
## + Fold5: mtry=4 
## - Fold5: mtry=4 
## + Fold5: mtry=5 
## - Fold5: mtry=5 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 3 on full training set
## [1] 5.092652

None of the Random Forest models outperform our multiple linear model in terms of RSS, despite going below 5 for the multiple Random Forest model.

XGBoost

XGBoost is also based on decision trees. However, these trees are built sequentially on the errors of the past trees. The final model is a combination of all the trees.

# XGBoost with all initial features
set.seed(42)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

xgb_init <- train(
    rings ~.,
    data = abalone_train[, init_feat],
    method = "xgbTree",
    trControl = ctrl,
    verbosity = 0
)

summary(xgb_init)
##               Length Class              Mode       
## handle            1  xgb.Booster.handle externalptr
## raw           61129  -none-             raw        
## niter             1  -none-             numeric    
## call              6  -none-             call       
## params            9  -none-             list       
## callbacks         1  -none-             list       
## feature_names     9  -none-             character  
## nfeatures         1  -none-             numeric    
## xNames            9  -none-             character  
## problemType       1  -none-             character  
## tuneValue         7  data.frame         list       
## obsLevels         1  -none-             logical    
## param             1  -none-             list
predict_error(xgb_init, abalone_test)
## [1] 5.259585
# XGBoost with all computed features
set.seed(42)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

xgb_complete <- train(
    rings ~ adulthood + length + log_length + length * adulthood + log_length * adulthood +
    diameter + log_diameter + diameter * adulthood + log_diameter * adulthood + height + height2 +
    log_height + height * adulthood + height2 * adulthood + log_height * adulthood + shucked_weight
    + log_shucked_weight + shucked_weight * adulthood + log_shucked_weight * adulthood +
    viscera_weight + log_viscera_weight + viscera_weight * adulthood + log_viscera_weight *
    adulthood + whole_weight + log_whole_weight + whole_weight * adulthood + log_whole_weight *
    adulthood + shell_weight + log_shell_weight + shell_weight * adulthood + log_shell_weight *
    adulthood,
    data = abalone_train,
    method = "xgbTree",
    trControl = ctrl,
    verbosity = 0
)

summary(xgb_complete)
##               Length Class              Mode       
## handle            1  xgb.Booster.handle externalptr
## raw           40179  -none-             raw        
## niter             1  -none-             numeric    
## call              6  -none-             call       
## params            9  -none-             list       
## callbacks         1  -none-             list       
## feature_names    31  -none-             character  
## nfeatures         1  -none-             numeric    
## xNames           31  -none-             character  
## problemType       1  -none-             character  
## tuneValue         7  data.frame         list       
## obsLevels         1  -none-             logical    
## param             1  -none-             list
predict_error(xgb_complete, abalone_test)
## [1] 5.145367
# XGBoost with selected features
set.seed(42)

ctrl = trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

xgb_multi <- train(
    rings ~ length * adulthood + log_length + height + height * adulthood +
    shucked_weight * adulthood + log_shucked_weight * adulthood + viscera_weight +
    log_whole_weight * adulthood + whole_weight * adulthood + log_shell_weight,
    data = abalone_train,
    method = "xgbTree",
    trControl = ctrl,
    verbosity = 0
)

summary(xgb_multi)
##               Length Class              Mode       
## handle            1  xgb.Booster.handle externalptr
## raw           78152  -none-             raw        
## niter             1  -none-             numeric    
## call              6  -none-             call       
## params            9  -none-             list       
## callbacks         1  -none-             list       
## feature_names    16  -none-             character  
## nfeatures         1  -none-             numeric    
## xNames           16  -none-             character  
## problemType       1  -none-             character  
## tuneValue         7  data.frame         list       
## obsLevels         1  -none-             logical    
## param             1  -none-             list
predict_error(xgb_multi, abalone_test)
## [1] 5.159744
# XGBoost with selected features + grid search
set.seed(42)

gridsearch_xgb <- expand.grid(
    eta = c(0.1, 0.3, 0.5),
    max_depth = c(3, 5, 7),
    min_child_weight = 1,
    subsample = 0.8,
    colsample_bytree = 0.8,
    nrounds = (1:10) * 200,
    gamma = 0
)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42),
    verboseIter = TRUE,
)

xgb_grid <- train(
    rings ~ length * adulthood + log_length + height + height * adulthood +
    shucked_weight * adulthood + log_shucked_weight * adulthood + viscera_weight +
    log_whole_weight * adulthood + whole_weight * adulthood + log_shell_weight,
    data = abalone_train,
    method = "xgbTree",
    tuneGrid = gridsearch_xgb,
    trControl = ctrl,
    verbosity = 0
)
## + Fold1: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold1: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold1: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold2: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold2: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold3: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold3: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold4: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold4: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.1, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.1, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.1, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.3, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.3, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.3, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.5, max_depth=3, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.5, max_depth=5, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## + Fold5: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## - Fold5: eta=0.5, max_depth=7, gamma=0, colsample_bytree=0.8, min_child_weight=1, subsample=0.8, nrounds=2000 
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 200, max_depth = 3, eta = 0.1, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set
summary(xgb_grid)
##               Length Class              Mode       
## handle             1 xgb.Booster.handle externalptr
## raw           236979 -none-             raw        
## niter              1 -none-             numeric    
## call               6 -none-             call       
## params             9 -none-             list       
## callbacks          1 -none-             list       
## feature_names     16 -none-             character  
## nfeatures          1 -none-             numeric    
## xNames            16 -none-             character  
## problemType        1 -none-             character  
## tuneValue          7 data.frame         list       
## obsLevels          1 -none-             logical    
## param              1 -none-             list
predict_error(xgb_grid, abalone_test)
## [1] 5.000799

None of the XGBoost models outperform our multiple linear model in terms of RSS.

Kernel density estimators

Kernel density estimators are non-parametric methods which attempt to estimate the probability density function of a variable, depending on the chosen kernel and bandwith.

kernel_init <- ksmooth(
    x = abalone_train$height,
    y = abalone_train$rings,
    kernel = "normal",
    bandwidth = 1,
    x.points = abalone_test$height
)

plot(
    abalone_train$height,
    abalone_train$rings,
    ylab = "Rings",
    xlab = "Height",
    pch = 20,
    cex = 0.8,
    type = "p",
    main = "Predicted density from kernel estimator"
) +
lines(
    kernel_init,
    lwd = 3,
    col = "limegreen"
)

## integer(0)
y_pred <- kernel_init$y
y_pred[is.na(y_pred)] <- mean(y_pred, na.rm = TRUE)

print(residual_mean_of_squares(abalone_test$rings, y_pred))
## [1] 14.23709

The following function allows to find the best bandwith thanks to a grid search.

set.seed(42)

n <- length(abalone_train$height)
n_cv <- 100
k <- 5
cv_lab <- sample(n, n, replace = F) %% k
h_seq <- seq(0.1, 5, by = 0.01)

cv_err_h <- rep(0, length(h_seq))

for (i_tmp in 1:n_cv) {

    print(i_tmp)

    cv_err_h_tmp <- rep(0, length(h_seq))
    cv_lab <- sample(n, n, replace = F) %% k

    for (i in seq_len(length(h_seq))) {

        h0 <- h_seq[i]
        cv_err <- 0

        for (i_cv in 1:k) {

            w_val <- which(cv_lab == (i_cv - 1))

            x_tr <- abalone_train$height[-w_val]
            y_tr <- abalone_train$rings[-w_val]
            x_val <- abalone_train$height[w_val]
            y_val <- abalone_train$rings[w_val]

            kernel_reg <- ksmooth(
                x = x_tr,
                y = y_tr,
                kernel = "normal",
                bandwidth = h0,
                x.points = x_val
            )

            cv_err <- cv_err + mean((y_val[order(x_val)] - kernel_reg$y)^2, na.rm = T)

        }

        cv_err_h_tmp[i] <- cv_err / k

    }

    cv_err_h <- cv_err_h + cv_err_h_tmp

}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
## [1] 70
## [1] 71
## [1] 72
## [1] 73
## [1] 74
## [1] 75
## [1] 76
## [1] 77
## [1] 78
## [1] 79
## [1] 80
## [1] 81
## [1] 82
## [1] 83
## [1] 84
## [1] 85
## [1] 86
## [1] 87
## [1] 88
## [1] 89
## [1] 90
## [1] 91
## [1] 92
## [1] 93
## [1] 94
## [1] 95
## [1] 96
## [1] 97
## [1] 98
## [1] 99
## [1] 100
cv_err_h <- cv_err_h / n_cv

plot(
    h_seq,
    cv_err_h,
    xlab = "Smoothing Bandwidth",
    ylab = "CV Error",
    type = "l",
    lwd = 4,
    col = "blue"
)

# Optimal density estimator
# h_opt_final <- h_seq[which(cv_err_h == min(cv_err_h))]
h_opt_final <- 4.52

kernel_init <- ksmooth(
    x = abalone_train$height,
    y = abalone_train$rings,
    kernel = "normal",
    bandwidth = h_opt_final,
    x.points = abalone_test$height
)

plot(
    abalone_train$height,
    abalone_train$rings,
    ylab = "Rings",
    xlab = "Height",
    pch = 20,
    cex = 0.8,
    type = "p",
    main = "Predicted density from kernel estimator"
) +
lines(
    kernel_init,
    lwd = 3,
    col = "limegreen"
)

## integer(0)
y_pred <- kernel_init$y
y_pred[is.na(y_pred)] <- mean(y_pred, na.rm = TRUE)

print(residual_mean_of_squares(abalone_test$rings, y_pred))
## [1] 14.10033

None of the kernel estimators outperform our multiple linear model in terms of RSS.

GLMNet

# glmnet with all initial features
set.seed(42)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

glm_init <- train(
    rings ~ .,
    data = abalone_train[, init_feat],
    method = "glmnet",
    trControl = ctrl,
    verbosity = 0
)

summary(glm_init)
##             Length Class      Mode     
## a0           87    -none-     numeric  
## beta        783    dgCMatrix  S4       
## df           87    -none-     numeric  
## dim           2    -none-     numeric  
## lambda       87    -none-     numeric  
## dev.ratio    87    -none-     numeric  
## nulldev       1    -none-     numeric  
## npasses       1    -none-     numeric  
## jerr          1    -none-     numeric  
## offset        1    -none-     logical  
## call          6    -none-     call     
## nobs          1    -none-     numeric  
## lambdaOpt     1    -none-     numeric  
## xNames        9    -none-     character
## problemType   1    -none-     character
## tuneValue     2    data.frame list     
## obsLevels     1    -none-     logical  
## param         1    -none-     list
predict_error(glm_init, abalone_test)
## [1] 5.158147
# glmnet with all computed features
set.seed(42)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

glm_complete <- train(
    rings ~ adulthood + length + log_length + length * adulthood + log_length * adulthood +
    diameter + log_diameter + diameter * adulthood + log_diameter * adulthood + height + height2 +
    log_height + height * adulthood + height2 * adulthood + log_height * adulthood + shucked_weight
    + log_shucked_weight + shucked_weight * adulthood + log_shucked_weight * adulthood +
    viscera_weight + log_viscera_weight + viscera_weight * adulthood + log_viscera_weight *
    adulthood + whole_weight + log_whole_weight + whole_weight * adulthood + log_whole_weight *
    adulthood + shell_weight + log_shell_weight + shell_weight * adulthood + log_shell_weight *
    adulthood,
    data = abalone_train,
    method = "glmnet",
    trControl = ctrl,
    verbosity = 0
)

summary(glm_complete)
##             Length Class      Mode     
## a0           100   -none-     numeric  
## beta        3100   dgCMatrix  S4       
## df           100   -none-     numeric  
## dim            2   -none-     numeric  
## lambda       100   -none-     numeric  
## dev.ratio    100   -none-     numeric  
## nulldev        1   -none-     numeric  
## npasses        1   -none-     numeric  
## jerr           1   -none-     numeric  
## offset         1   -none-     logical  
## call           6   -none-     call     
## nobs           1   -none-     numeric  
## lambdaOpt      1   -none-     numeric  
## xNames        31   -none-     character
## problemType    1   -none-     character
## tuneValue      2   data.frame list     
## obsLevels      1   -none-     logical  
## param          1   -none-     list
predict_error(glm_complete, abalone_test)
## [1] 4.824281
# glmnet with all computed features(log)
set.seed(42)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

glm_log_complete <- train(
    log_rings ~ adulthood + length + log_length + length * adulthood + log_length * adulthood +
    diameter + log_diameter + diameter * adulthood + log_diameter * adulthood + height + height2 +
    log_height + height * adulthood + height2 * adulthood + log_height * adulthood + shucked_weight
    + log_shucked_weight + shucked_weight * adulthood + log_shucked_weight * adulthood +
    viscera_weight + log_viscera_weight + viscera_weight * adulthood + log_viscera_weight *
    adulthood + whole_weight + log_whole_weight + whole_weight * adulthood + log_whole_weight *
    adulthood + shell_weight + log_shell_weight + shell_weight * adulthood + log_shell_weight *
    adulthood,
    data = abalone_train,
    method = "glmnet",
    trControl = ctrl,
    verbosity = 0
)

summary(glm_log_complete)
##             Length Class      Mode     
## a0           100   -none-     numeric  
## beta        3100   dgCMatrix  S4       
## df           100   -none-     numeric  
## dim            2   -none-     numeric  
## lambda       100   -none-     numeric  
## dev.ratio    100   -none-     numeric  
## nulldev        1   -none-     numeric  
## npasses        1   -none-     numeric  
## jerr           1   -none-     numeric  
## offset         1   -none-     logical  
## call           6   -none-     call     
## nobs           1   -none-     numeric  
## lambdaOpt      1   -none-     numeric  
## xNames        31   -none-     character
## problemType    1   -none-     character
## tuneValue      2   data.frame list     
## obsLevels      1   -none-     logical  
## param          1   -none-     list
predict_log_error(glm_log_complete, abalone_test)
## [1] 4.909744
# glmnet with selected features
set.seed(42)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

glm_multi <- train(
    rings ~ length * adulthood + log_length + height + height * adulthood +
    shucked_weight * adulthood + log_shucked_weight * adulthood + viscera_weight +
    log_whole_weight * adulthood + whole_weight * adulthood + log_shell_weight,
    data = abalone_train,
    method = "glmnet",
    trControl = ctrl,
    verbosity = 0
)

summary(glm_multi)
##             Length Class      Mode     
## a0           100   -none-     numeric  
## beta        1600   dgCMatrix  S4       
## df           100   -none-     numeric  
## dim            2   -none-     numeric  
## lambda       100   -none-     numeric  
## dev.ratio    100   -none-     numeric  
## nulldev        1   -none-     numeric  
## npasses        1   -none-     numeric  
## jerr           1   -none-     numeric  
## offset         1   -none-     logical  
## call           6   -none-     call     
## nobs           1   -none-     numeric  
## lambdaOpt      1   -none-     numeric  
## xNames        16   -none-     character
## problemType    1   -none-     character
## tuneValue      2   data.frame list     
## obsLevels      1   -none-     logical  
## param          1   -none-     list
predict_error(glm_multi, abalone_test)
## [1] 4.870607
# glmnet with selected features and log transf. of rings
set.seed(42)

ctrl <- trainControl(
    method = "cv",
    number = 5,
    seeds = set.seed(42)
)

glm_log_multi <- train(
    log_rings ~ length * adulthood + log_length + height + height * adulthood +
    shucked_weight * adulthood + log_shucked_weight * adulthood + viscera_weight +
    log_whole_weight * adulthood + whole_weight * adulthood + log_shell_weight,
    data = abalone_train,
    method = "glmnet",
    trControl = ctrl,
    verbosity = 0
)

summary(glm_log_multi)
##             Length Class      Mode     
## a0           100   -none-     numeric  
## beta        1600   dgCMatrix  S4       
## df           100   -none-     numeric  
## dim            2   -none-     numeric  
## lambda       100   -none-     numeric  
## dev.ratio    100   -none-     numeric  
## nulldev        1   -none-     numeric  
## npasses        1   -none-     numeric  
## jerr           1   -none-     numeric  
## offset         1   -none-     logical  
## call           6   -none-     call     
## nobs           1   -none-     numeric  
## lambdaOpt      1   -none-     numeric  
## xNames        16   -none-     character
## problemType    1   -none-     character
## tuneValue      2   data.frame list     
## obsLevels      1   -none-     logical  
## param          1   -none-     list
predict_log_error(glm_log_multi, abalone_test)
## [1] 4.915335